Biostat 212a Homework 4

Due Mar. 5, 2024 @ 11:59PM

Author

Wenqing Mao, UID:806332971

Published

March 4, 2024

1 ISL Exercise 8.4.3 (10pts)

library(tidyverse)

# Two-class setting: p1 = P(class 1), p2 = P(class 2) = 1 - p1
p1 <- seq(0, 1, 0.01)
p2 <- 1 - p1

# Classification error: 1 - max_k(p_k)
classification_error <- 1 - pmax(p1, p2)

# Gini index: sum_k p_k * (1 - p_k)
gini <- p1 * (1 - p1) + p2 * (1 - p2)

# Entropy: -sum_k p_k * log(p_k), using the convention 0 * log(0) = 0 so the
# endpoints p1 = 0 and p1 = 1 evaluate to 0 instead of NaN (the original
# expression produced NaN there, and geom_line() dropped 2 rows with a warning)
xlogx <- function(p) ifelse(p > 0, p * log(p), 0)
entropy <- -xlogx(p1) - xlogx(p2)

# Long format so each measure becomes one line in the plot; labels are
# matched to the alphabetical factor levels of `metrics`
data.frame(p1, p2, classification_error, gini, entropy) %>%
  pivot_longer(cols = c(classification_error, gini, entropy), names_to = "metrics") %>%
  ggplot(aes(x = p1, y = value, col = factor(metrics))) + 
  geom_line() + 
  scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
  scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
  labs(col = "Metrics", 
       y = "Value", 
       x = "P1")

2 ISL Exercise 8.4.4 (10pts)

Here is my answer to this question: Answer

3 ISL Exercise 8.4.5 (10pts)

Method one: Majority vote approach

# Predicted P(Class is Red | X) from the 10 bootstrapped trees
probs <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)

# Majority vote: count predictions on each side of the 0.5 cutoff
n_red <- length(which(probs >= 0.5))
cat("The number of red observations is: ", n_red, "\n")
The number of red observations is:  6 
cat("The number of green observations is: ", length(probs) - n_red)
The number of green observations is:  4

The final classification under the majority vote approach is red.

Method two: Average probability approach

# Average-probability approach: mean of the 10 bootstrap probabilities
mean(probs)
[1] 0.45

The average probability is 0.45, which is less than 0.5. The final classification under the average probability approach is green.

4 ISL Lab 8.3. Boston data set (30pts)

Follow the machine learning workflow to train regression tree, random forest, and boosting methods for predicting medv. Evaluate out-of-sample performance on a test set.

Answer:

Regression tree

Load and summary data

# Load plotting/modeling libraries and the Boston housing data (response: medv)
library(GGally) 
library(tidyverse)
library(tidymodels)
library(ISLR2)

# Load the Boston data set
data(Boston)

# Summary of the data
summary(Boston)
      crim                zn             indus            chas        
 Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
 1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
 Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
 Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
 3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
 Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
      nox               rm             age              dis        
 Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
 1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
 Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
 Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
 3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
 Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
      rad              tax           ptratio          lstat      
 Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
 1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
 Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
 Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
 3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
 Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
      medv      
 Min.   : 5.00  
 1st Qu.:17.02  
 Median :21.20  
 Mean   :22.53  
 3rd Qu.:25.00  
 Max.   :50.00  
# Pairwise scatterplots and marginal histograms to eyeball the
# relationships between the predictors and medv
ggpairs(Boston, lower=list(continuous=wrap("points", alpha=0.3, size=0.3)),
        diag=list(continuous='barDiag')) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 3)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 3))

Split the data into training and test sets

# Reproducible 75/25 train-test split
set.seed(123)
data_split <- initial_split(
  Boston, 
  prop = 0.75
  )

Boston_train <- training(data_split)
dim(Boston_train)
[1] 379  13
Boston_test <- testing(data_split)
dim(Boston_test)
[1] 127  13

Build recipe

# Preprocessing recipe: dummy-encode nominal predictors, then center/scale
# all numeric predictors. all_nominal_predictors() (rather than all_nominal())
# guarantees the outcome is never dummy-encoded, and matches the selector
# style used in the Carseats recipe later in this document.
rec <- recipe(medv ~ ., data = Boston_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

Train regression tree

Regression tree model

# Regression tree: cost-complexity and depth are tuned by CV;
# each node must contain at least 10 observations before splitting.
regtree_mod <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = 10
) %>%
  set_mode("regression") %>%
  set_engine("rpart")

Set workflow

# Bundle the preprocessing recipe and the tree spec into one workflow
regtree_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(regtree_mod)
regtree_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = tune()
  tree_depth = tune()
  min_n = 10

Computational engine: rpart 

Tuning grid

# 100 cost-complexity values (10^-10 to 10^-2) x 5 depths (1-10) = 500 candidates
regtree_grid <- grid_regular(cost_complexity(range = c(-10, -2)), # set tuning range
                          tree_depth(range = c(1, 10)),
                          levels = c(100, 5))

10-fold Cross-validation

# 10-fold cross-validation folds; seed fixed for reproducibility
set.seed(123)

folds <- vfold_cv(Boston_train, v = 10)
folds
#  10-fold cross-validation 
# A tibble: 10 × 2
   splits           id    
   <list>           <chr> 
 1 <split [341/38]> Fold01
 2 <split [341/38]> Fold02
 3 <split [341/38]> Fold03
 4 <split [341/38]> Fold04
 5 <split [341/38]> Fold05
 6 <split [341/38]> Fold06
 7 <split [341/38]> Fold07
 8 <split [341/38]> Fold08
 9 <split [341/38]> Fold09
10 <split [342/37]> Fold10

Fit cross-validation

# Evaluate every grid candidate on each fold (RMSE and R-squared)
regtree_fit <- regtree_wf %>%
  tune_grid(
    resamples = folds,
    grid = regtree_grid,
    metrics = metric_set(rmse, rsq)
    )
regtree_fit
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics             .notes          
   <list>           <chr>  <list>               <list>          
 1 <split [341/38]> Fold01 <tibble [1,000 × 6]> <tibble [0 × 3]>
 2 <split [341/38]> Fold02 <tibble [1,000 × 6]> <tibble [0 × 3]>
 3 <split [341/38]> Fold03 <tibble [1,000 × 6]> <tibble [0 × 3]>
 4 <split [341/38]> Fold04 <tibble [1,000 × 6]> <tibble [0 × 3]>
 5 <split [341/38]> Fold05 <tibble [1,000 × 6]> <tibble [0 × 3]>
 6 <split [341/38]> Fold06 <tibble [1,000 × 6]> <tibble [0 × 3]>
 7 <split [341/38]> Fold07 <tibble [1,000 × 6]> <tibble [0 × 3]>
 8 <split [341/38]> Fold08 <tibble [1,000 × 6]> <tibble [0 × 3]>
 9 <split [341/38]> Fold09 <tibble [1,000 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [1,000 × 6]> <tibble [0 × 3]>

Visualize CV results

# CV RMSE vs cost complexity, one line per tree depth;
# print(width = Inf) shows the full metrics tibble and passes it along unchanged
regtree_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "rmse") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(mapping = aes(x = cost_complexity, y = mean, color = tree_depth)) +
  geom_point() + 
  geom_line() + 
  labs(x = "Cost Complexity", y = "CV RMSE")
# A tibble: 1,000 × 8
   cost_complexity tree_depth .metric .estimator  mean     n std_err
             <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl>
 1        1   e-10          1 rmse    standard   7.54     10  0.265 
 2        1   e-10          1 rsq     standard   0.330    10  0.0352
 3        1.20e-10          1 rmse    standard   7.54     10  0.265 
 4        1.20e-10          1 rsq     standard   0.330    10  0.0352
 5        1.45e-10          1 rmse    standard   7.54     10  0.265 
 6        1.45e-10          1 rsq     standard   0.330    10  0.0352
 7        1.75e-10          1 rmse    standard   7.54     10  0.265 
 8        1.75e-10          1 rsq     standard   0.330    10  0.0352
 9        2.10e-10          1 rmse    standard   7.54     10  0.265 
10        2.10e-10          1 rsq     standard   0.330    10  0.0352
   .config               
   <chr>                 
 1 Preprocessor1_Model001
 2 Preprocessor1_Model001
 3 Preprocessor1_Model002
 4 Preprocessor1_Model002
 5 Preprocessor1_Model003
 6 Preprocessor1_Model003
 7 Preprocessor1_Model004
 8 Preprocessor1_Model004
 9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 990 more rows

Finalize regression tree model

# Pick the hyperparameters with the lowest CV RMSE. The `metric` argument is
# named explicitly: recent versions of tune deprecate passing it positionally.
regtree_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1        0.00129          10 rmse    standard    4.30    10   0.579 Preprocesso…
2        0.00129           7 rmse    standard    4.31    10   0.591 Preprocesso…
3        0.000242          7 rmse    standard    4.33    10   0.582 Preprocesso…
4        0.000138          7 rmse    standard    4.33    10   0.582 Preprocesso…
5        0.000167          7 rmse    standard    4.33    10   0.582 Preprocesso…
best_regtree <- regtree_fit %>%
  select_best(metric = "rmse")
best_regtree
# A tibble: 1 × 3
  cost_complexity tree_depth .config               
            <dbl>      <int> <chr>                 
1         0.00129         10 Preprocessor1_Model489

Final workflow

# Plug the best hyperparameters into the workflow
final_regtree_wf <- regtree_wf %>%
  finalize_workflow(best_regtree)
final_regtree_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = 0.00129154966501489
  tree_depth = 10
  min_n = 10

Computational engine: rpart 

Fit the whole training set, then predict the test cases

# last_fit(): refit on the full training set, then evaluate on the test set
final_regtree_fit <- 
  final_regtree_wf %>%
  last_fit(data_split)

# Test metrics
final_regtree_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       4.43  Preprocessor1_Model1
2 rsq     standard       0.783 Preprocessor1_Model1

Visualize the final model

library(rpart.plot)
# Pull the fitted workflow out of the last_fit() result and draw the tree
final_regtree <- extract_workflow(final_regtree_fit)
final_regtree %>%
  extract_fit_engine() %>%
  rpart.plot(roundint = FALSE)

library(vip)

# Variable importance of the final regression tree
final_regtree %>% 
  extract_fit_parsnip() %>% 
  vip()

Summary: The final regression tree model has a depth of 10 and cost complexity of 0.0013. The 3 most important variables are rm, lstat and dis, which means that the average number of rooms per dwelling, lower status of the population, and weighted distances to five Boston employment centers are the most important predictors of medv. The model estimates new medv values using the average value of medv in the terminal nodes (leaves) and has a relatively good performance with RMSE of 4.43 and R-squared of 0.78.

Train random forest

Random forest model

# Random forest spec (ranger engine); mtry and number of trees are tuned
rf_mod <- 
  rand_forest(
    mode = "regression",
    # Number of predictors randomly sampled in each split
    mtry = tune(),
    # Number of trees in ensemble
    trees = tune()
  ) %>% 
  set_engine("ranger")
rf_mod
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  trees = tune()

Computational engine: ranger 

Set workflow

# Reuse the same preprocessing recipe with the random forest spec
rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_mod)
rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  trees = tune()

Computational engine: ranger 

Tuning grid

# 10 tree counts (100-500) x 5 mtry values (2-6) = 50 candidates
rf_grid <- grid_regular(
  trees(range = c(100L, 500L)), 
  mtry(range = c(2L, 6L)),
  levels = c(10, 5)
  )

10-fold Cross-validation

# Same seed as before so the CV folds match the regression-tree tuning
set.seed(123)

folds <- vfold_cv(Boston_train, v = 10)

Fit cross-validation

library(ranger)

# Cross-validate each (mtry, trees) combination
rf_fit <- rf_wf %>%
  tune_grid(
    resamples = folds,
    grid = rf_grid,
    metrics = metric_set(rmse, rsq)
    )
rf_fit
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics           .notes          
   <list>           <chr>  <list>             <list>          
 1 <split [341/38]> Fold01 <tibble [100 × 6]> <tibble [0 × 3]>
 2 <split [341/38]> Fold02 <tibble [100 × 6]> <tibble [0 × 3]>
 3 <split [341/38]> Fold03 <tibble [100 × 6]> <tibble [0 × 3]>
 4 <split [341/38]> Fold04 <tibble [100 × 6]> <tibble [0 × 3]>
 5 <split [341/38]> Fold05 <tibble [100 × 6]> <tibble [0 × 3]>
 6 <split [341/38]> Fold06 <tibble [100 × 6]> <tibble [0 × 3]>
 7 <split [341/38]> Fold07 <tibble [100 × 6]> <tibble [0 × 3]>
 8 <split [341/38]> Fold08 <tibble [100 × 6]> <tibble [0 × 3]>
 9 <split [341/38]> Fold09 <tibble [100 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [100 × 6]> <tibble [0 × 3]>

Visualize CV results

# CV RMSE vs number of trees, one line per mtry value
rf_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "rmse") %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(mapping = aes(x = trees, y = mean, color = mtry)) +
  geom_point() + 
  geom_line() + 
  labs(x = "Number of Trees", y = "CV RMSE")
# A tibble: 100 × 8
    mtry trees .metric .estimator  mean     n std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1     2   100 rmse    standard   3.55     10  0.399  Preprocessor1_Model01
 2     2   100 rsq     standard   0.865    10  0.0236 Preprocessor1_Model01
 3     2   144 rmse    standard   3.51     10  0.425  Preprocessor1_Model02
 4     2   144 rsq     standard   0.867    10  0.0256 Preprocessor1_Model02
 5     2   188 rmse    standard   3.60     10  0.419  Preprocessor1_Model03
 6     2   188 rsq     standard   0.858    10  0.0250 Preprocessor1_Model03
 7     2   233 rmse    standard   3.61     10  0.435  Preprocessor1_Model04
 8     2   233 rsq     standard   0.857    10  0.0267 Preprocessor1_Model04
 9     2   277 rmse    standard   3.62     10  0.429  Preprocessor1_Model05
10     2   277 rsq     standard   0.857    10  0.0267 Preprocessor1_Model05
# ℹ 90 more rows

Finalize random forest model

# Pick the RF hyperparameters with the lowest CV RMSE; `metric` is named
# explicitly because recent tune versions deprecate the positional form.
rf_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
   mtry trees .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     5   411 rmse    standard    3.14    10   0.308 Preprocessor1_Model38
2     5   233 rmse    standard    3.14    10   0.307 Preprocessor1_Model34
3     5   188 rmse    standard    3.15    10   0.283 Preprocessor1_Model33
4     6   500 rmse    standard    3.15    10   0.309 Preprocessor1_Model50
5     6   233 rmse    standard    3.15    10   0.294 Preprocessor1_Model44
best_rf <- rf_fit %>%
  select_best(metric = "rmse")
best_rf
# A tibble: 1 × 3
   mtry trees .config              
  <int> <int> <chr>                
1     5   411 Preprocessor1_Model38

Final workflow

# Plug the best (mtry, trees) pair into the workflow
final_rf_wf <- rf_wf %>%
  finalize_workflow(best_rf)
final_rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 5
  trees = 411

Computational engine: ranger 

Fit the whole training set, then predict the test cases

# Refit the best random forest on the full training set; evaluate on the test set
final_rf_fit <- 
  final_rf_wf %>%
  last_fit(data_split)
final_rf_fit
# Resampling results
# Manual resampling 
# A tibble: 1 × 6
  splits            id               .metrics .notes   .predictions .workflow 
  <list>            <chr>            <list>   <list>   <list>       <list>    
1 <split [379/127]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Test metrics
final_rf_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       3.61  Preprocessor1_Model1
2 rsq     standard       0.844 Preprocessor1_Model1

Summary: The random forest algorithm grows multiple decision trees; the final random forest model has 411 trees with 5 predictors randomly sampled at each split. In regression problems, a random forest estimates new medv values as the average of all the individual tree predictions. The model has a relatively good performance with RMSE of 3.61 and R-squared of 0.84.

Train boosting

Train boosting model

# Gradient boosting spec (xgboost): 1000 trees; depth and learning rate tuned
gb_mod <- 
  boost_tree(
    mode = "regression",
    trees = 1000, 
    tree_depth = tune(),
    learn_rate = tune()
  ) %>% 
  set_engine("xgboost")
gb_mod
Boosted Tree Model Specification (regression)

Main Arguments:
  trees = 1000
  tree_depth = tune()
  learn_rate = tune()

Computational engine: xgboost 

Set workflow

# Same preprocessing recipe with the boosting spec
gb_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(gb_mod)
gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)

Main Arguments:
  trees = 1000
  tree_depth = tune()
  learn_rate = tune()

Computational engine: xgboost 

Tuning grid

# 5 depths (2-10) x 10 learning rates (10^-3 to 10^-1, log-spaced) = 50 candidates
gb_grid <- grid_regular(
  tree_depth(range = c(2L, 10L)),
  learn_rate(range = c(-3, -1), trans = log10_trans()),
  levels = c(5, 10)
  )
gb_grid
# A tibble: 50 × 2
   tree_depth learn_rate
        <int>      <dbl>
 1          2    0.001  
 2          4    0.001  
 3          6    0.001  
 4          8    0.001  
 5         10    0.001  
 6          2    0.00167
 7          4    0.00167
 8          6    0.00167
 9          8    0.00167
10         10    0.00167
# ℹ 40 more rows

10-fold Cross-validation

# Same seed again so all three models are tuned on identical CV folds
set.seed(123)

folds <- vfold_cv(Boston_train, v = 10)

Fit cross-validation

library(xgboost)

# Cross-validate each (tree_depth, learn_rate) combination
gb_fit <- gb_wf %>%
  tune_grid(
    resamples = folds,
    grid = gb_grid,
    metrics = metric_set(rmse, rsq)
    )
gb_fit
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics           .notes          
   <list>           <chr>  <list>             <list>          
 1 <split [341/38]> Fold01 <tibble [100 × 6]> <tibble [0 × 3]>
 2 <split [341/38]> Fold02 <tibble [100 × 6]> <tibble [0 × 3]>
 3 <split [341/38]> Fold03 <tibble [100 × 6]> <tibble [0 × 3]>
 4 <split [341/38]> Fold04 <tibble [100 × 6]> <tibble [0 × 3]>
 5 <split [341/38]> Fold05 <tibble [100 × 6]> <tibble [0 × 3]>
 6 <split [341/38]> Fold06 <tibble [100 × 6]> <tibble [0 × 3]>
 7 <split [341/38]> Fold07 <tibble [100 × 6]> <tibble [0 × 3]>
 8 <split [341/38]> Fold08 <tibble [100 × 6]> <tibble [0 × 3]>
 9 <split [341/38]> Fold09 <tibble [100 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [100 × 6]> <tibble [0 × 3]>

Visualize CV results

# CV RMSE vs learning rate (log x-axis), one line per tree depth
gb_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "rmse") %>%
  ggplot(mapping = aes(x = learn_rate, y = mean, color = factor(tree_depth))) +
  geom_point() +
  geom_line() +
  labs(x = "Learning Rate", y = "CV RMSE") +
  scale_x_log10()
# A tibble: 100 × 8
   tree_depth learn_rate .metric .estimator  mean     n std_err
        <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
 1          2      0.001 rmse    standard   9.94     10  0.483 
 2          2      0.001 rsq     standard   0.779    10  0.0310
 3          4      0.001 rmse    standard   9.85     10  0.446 
 4          4      0.001 rsq     standard   0.817    10  0.0287
 5          6      0.001 rmse    standard   9.86     10  0.439 
 6          6      0.001 rsq     standard   0.822    10  0.0290
 7          8      0.001 rmse    standard   9.87     10  0.442 
 8          8      0.001 rsq     standard   0.821    10  0.0297
 9         10      0.001 rmse    standard   9.87     10  0.442 
10         10      0.001 rsq     standard   0.821    10  0.0297
   .config              
   <chr>                
 1 Preprocessor1_Model01
 2 Preprocessor1_Model01
 3 Preprocessor1_Model02
 4 Preprocessor1_Model02
 5 Preprocessor1_Model03
 6 Preprocessor1_Model03
 7 Preprocessor1_Model04
 8 Preprocessor1_Model04
 9 Preprocessor1_Model05
10 Preprocessor1_Model05
# ℹ 90 more rows

Finalize boosting model

# Pick the boosting hyperparameters with the lowest CV RMSE; `metric` is named
# explicitly because recent tune versions deprecate the positional form.
gb_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
  tree_depth learn_rate .metric .estimator  mean     n std_err .config          
       <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
1          6    0.0129  rmse    standard    3.12    10   0.333 Preprocessor1_Mo…
2          6    0.0215  rmse    standard    3.12    10   0.339 Preprocessor1_Mo…
3          6    0.0359  rmse    standard    3.13    10   0.337 Preprocessor1_Mo…
4          4    0.0215  rmse    standard    3.13    10   0.305 Preprocessor1_Mo…
5          6    0.00774 rmse    standard    3.13    10   0.339 Preprocessor1_Mo…
best_gb <- gb_fit %>%
  select_best(metric = "rmse")
best_gb
# A tibble: 1 × 3
  tree_depth learn_rate .config              
       <int>      <dbl> <chr>                
1          6     0.0129 Preprocessor1_Model28

Final workflow

# Plug the best (tree_depth, learn_rate) pair into the workflow
final_gb_wf <- gb_wf %>%
  finalize_workflow(best_gb)
final_gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)

Main Arguments:
  trees = 1000
  tree_depth = 6
  learn_rate = 0.0129154966501488

Computational engine: xgboost 

Fit the whole training set, then predict the test cases

# Refit the best boosting model on the full training set; evaluate on the test set
final_gb_fit <- 
  final_gb_wf %>%
  last_fit(data_split)

# Test metrics
final_gb_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       3.71  Preprocessor1_Model1
2 rsq     standard       0.840 Preprocessor1_Model1

Summary: Gradient Boosting builds trees in a sequential manner, where each new tree aims to correct the errors made by the previous ones. The final gradient boosting model has 1000 trees, 6 levels of tree depth, and 0.01291 learning rate. The model estimates new medv values by adding trees that predict the residuals or errors of prior trees and it has a relatively good performance with RMSE of 3.71 and R-squared of 0.84.

Compare Performance

# Stack the test metrics of all three models and reshape to one row per model.
# pivot_wider() replaces the superseded spread().
final_regtree_fit %>% 
  collect_metrics() %>%
  mutate(model = "Regression Tree") %>%
  bind_rows(
    final_rf_fit %>% collect_metrics() %>%
      mutate(model = "Random Forest"),
    final_gb_fit %>% collect_metrics() %>%
      mutate(model = "Gradient Boosting")
  ) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate)
# A tibble: 3 × 3
  model              rmse   rsq
  <chr>             <dbl> <dbl>
1 Gradient Boosting  3.71 0.840
2 Random Forest      3.61 0.844
3 Regression Tree    4.43 0.783

5 ISL Lab 8.3 Carseats data set (30pts)

Follow the machine learning workflow to train classification tree, random forest, and boosting methods for classifying Sales <= 8 versus Sales > 8. Evaluate out-of-sample performance on a test set.

Answer: Load the data and create Sales binary response variable

library(ISLR2)
library(tidyverse)
library(tidymodels)
library(gtsummary)

# Load the Carseats data set
data(Carseats)
# Binarize the response: Sales > 8 -> "Yes", otherwise "No"; drop raw Sales
Carseats <- Carseats %>%
  mutate(Sales_bi = as.factor(ifelse(Sales <= 8, "No", "Yes"))) %>%
  select(-Sales)
# Summary of the data
Carseats %>% 
  tbl_summary(
    by = Sales_bi, # stratify by Sales_bi
    statistic = list(all_continuous() ~ "{mean} ({sd})", 
                     all_categorical() ~ "{n} ({p}%)"), 
    missing = "ifany" 
  )
Characteristic No, N = 2361 Yes, N = 1641
CompPrice 124 (15) 126 (16)
Income 65 (29) 74 (26)
Advertising 5.0 (5.8) 9.0 (7.1)
Population 260 (147) 273 (148)
Price 123 (22) 106 (23)
ShelveLoc

    Bad 82 (35%) 14 (8.5%)
    Good 19 (8.1%) 66 (40%)
    Medium 135 (57%) 84 (51%)
Age 56 (16) 49 (15)
Education

    10 23 (9.7%) 25 (15%)
    11 31 (13%) 17 (10%)
    12 30 (13%) 19 (12%)
    13 26 (11%) 17 (10%)
    14 26 (11%) 14 (8.5%)
    15 18 (7.6%) 18 (11%)
    16 29 (12%) 18 (11%)
    17 28 (12%) 21 (13%)
    18 25 (11%) 15 (9.1%)
Urban 172 (73%) 110 (67%)
US 135 (57%) 123 (75%)
1 Mean (SD); n (%)
# Pairwise plots of the continuous predictors (categoricals excluded)
Carseats %>%
  select(-c(ShelveLoc, Urban, US, Sales_bi)) %>%
  ggpairs(lower=list(continuous=wrap("points", alpha=0.5, size=0.5)),
        diag=list(continuous='barDiag')) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

Split the data into train and test sets

# Reproducible 80/20 split, stratified so both sets keep the Yes/No balance
set.seed(123)

data_split <- initial_split(
  Carseats,
  prop = 0.8,
  strata = Sales_bi
  )
data_split
<Training/Testing/Total>
<319/81/400>
Carseats_train <- training(data_split)
dim(Carseats_train)
[1] 319  11
Carseats_test <- testing(data_split)
dim(Carseats_test)
[1] 81 11

Build recipe

# Preprocessing: dummy-encode nominal predictors, center/scale numerics
class_rec <- 
  recipe(
    Sales_bi ~ ., 
    data = Carseats_train
  ) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

Train classification tree

Classification tree model

# Classification tree: cost-complexity and depth are tuned by CV;
# each node must contain at least 5 observations before splitting.
classtree_mod <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = 5
) %>%
  set_mode("classification") %>%
  set_engine("rpart")

Set workflow

# Bundle the classification recipe and tree spec into one workflow
classtree_wf <- workflow() %>%
  add_recipe(class_rec) %>%
  add_model(classtree_mod)

Tuning grid

# 100 cost-complexity values (10^-10 to 10^-1.5) x 5 depths (1-10) = 500 candidates
classtree_grid <- grid_regular(cost_complexity(range = c(-10, -1.5)),
                          tree_depth(range = c(1L, 10L)),
                          levels = c(100,5))

10-fold Cross-validation

# 10-fold CV folds on the Carseats training set; seed fixed for reproducibility
set.seed(123)

folds <- vfold_cv(Carseats_train, v = 10)
folds
#  10-fold cross-validation 
# A tibble: 10 × 2
   splits           id    
   <list>           <chr> 
 1 <split [287/32]> Fold01
 2 <split [287/32]> Fold02
 3 <split [287/32]> Fold03
 4 <split [287/32]> Fold04
 5 <split [287/32]> Fold05
 6 <split [287/32]> Fold06
 7 <split [287/32]> Fold07
 8 <split [287/32]> Fold08
 9 <split [287/32]> Fold09
10 <split [288/31]> Fold10

Fit cross-validation

# Evaluate every grid candidate on each fold (accuracy and ROC AUC)
classtree_fit <- classtree_wf %>%
  tune_grid(
    resamples = folds,
    grid = classtree_grid,
    metrics = metric_set(accuracy, roc_auc)
    )
classtree_fit
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics             .notes          
   <list>           <chr>  <list>               <list>          
 1 <split [287/32]> Fold01 <tibble [1,000 × 6]> <tibble [0 × 3]>
 2 <split [287/32]> Fold02 <tibble [1,000 × 6]> <tibble [0 × 3]>
 3 <split [287/32]> Fold03 <tibble [1,000 × 6]> <tibble [0 × 3]>
 4 <split [287/32]> Fold04 <tibble [1,000 × 6]> <tibble [0 × 3]>
 5 <split [287/32]> Fold05 <tibble [1,000 × 6]> <tibble [0 × 3]>
 6 <split [287/32]> Fold06 <tibble [1,000 × 6]> <tibble [0 × 3]>
 7 <split [287/32]> Fold07 <tibble [1,000 × 6]> <tibble [0 × 3]>
 8 <split [287/32]> Fold08 <tibble [1,000 × 6]> <tibble [0 × 3]>
 9 <split [287/32]> Fold09 <tibble [1,000 × 6]> <tibble [0 × 3]>
10 <split [288/31]> Fold10 <tibble [1,000 × 6]> <tibble [0 × 3]>

Visualize CV results

# CV ROC AUC vs cost complexity, one line per tree depth
classtree_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "roc_auc") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(mapping = aes(x = cost_complexity, y = mean, color = tree_depth)) +
  geom_point() + 
  geom_line() + 
  labs(x = "Cost Complexity", y = "CV ROC AUC", color = "tree_depth") 
# A tibble: 1,000 × 8
   cost_complexity tree_depth .metric  .estimator  mean     n std_err
             <dbl>      <int> <chr>    <chr>      <dbl> <int>   <dbl>
 1        1   e-10          1 accuracy binary     0.724    10  0.0237
 2        1   e-10          1 roc_auc  binary     0.681    10  0.0217
 3        1.22e-10          1 accuracy binary     0.724    10  0.0237
 4        1.22e-10          1 roc_auc  binary     0.681    10  0.0217
 5        1.48e-10          1 accuracy binary     0.724    10  0.0237
 6        1.48e-10          1 roc_auc  binary     0.681    10  0.0217
 7        1.81e-10          1 accuracy binary     0.724    10  0.0237
 8        1.81e-10          1 roc_auc  binary     0.681    10  0.0217
 9        2.21e-10          1 accuracy binary     0.724    10  0.0237
10        2.21e-10          1 roc_auc  binary     0.681    10  0.0217
   .config               
   <chr>                 
 1 Preprocessor1_Model001
 2 Preprocessor1_Model001
 3 Preprocessor1_Model002
 4 Preprocessor1_Model002
 5 Preprocessor1_Model003
 6 Preprocessor1_Model003
 7 Preprocessor1_Model004
 8 Preprocessor1_Model004
 9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 990 more rows

Finalize classification tree model

# Pick the hyperparameters with the highest CV ROC AUC; `metric` is named
# explicitly because recent tune versions deprecate the positional form.
classtree_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1        1.75e- 2          7 roc_auc binary     0.764    10  0.0248 Preprocesso…
2        1   e-10         10 roc_auc binary     0.761    10  0.0214 Preprocesso…
3        1.22e-10         10 roc_auc binary     0.761    10  0.0214 Preprocesso…
4        1.48e-10         10 roc_auc binary     0.761    10  0.0214 Preprocesso…
5        1.81e-10         10 roc_auc binary     0.761    10  0.0214 Preprocesso…
best_classtree <- classtree_fit %>%
  select_best(metric = "roc_auc")
best_classtree
# A tibble: 1 × 3
  cost_complexity tree_depth .config               
            <dbl>      <int> <chr>                 
1          0.0175          7 Preprocessor1_Model397

Final workflow

# Plug the best hyperparameters into the workflow
final_classtree_wf <- classtree_wf %>%
  finalize_workflow(best_classtree)

Fit the whole training set, then predict the test cases

# Refit on the full training set, then evaluate on the held-out test set
final_classtree_fit <- 
  final_classtree_wf %>%
  last_fit(data_split)

# Test metrics
final_classtree_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.728 Preprocessor1_Model1
2 roc_auc  binary         0.639 Preprocessor1_Model1
library(rpart.plot)
# Pull the fitted workflow out of the last_fit() result
final_classtree <- extract_workflow(final_classtree_fit)

Visualize the final classification tree

# Draw the fitted rpart classification tree
final_classtree %>%
  extract_fit_engine() %>%
  rpart.plot(roundint = FALSE)

Variables importance

library(vip)

# Variable importance of the final classification tree
final_classtree %>% 
  extract_fit_parsnip() %>% 
  vip()

Summary: The final classification tree model has a depth of 7 and cost complexity of 0.0175. The 3 most important variables are ShelveLoc_Good, Price and CompPrice, which means that whether the shelving location is good, the price of the car seat, and the price of the competing car seat are the most important predictors of whether Sales > 8. The model assigns each new observation the majority class of its terminal node (leaf) and shows a moderate out-of-sample performance with accuracy of 0.728 and ROC AUC of 0.639.

Train random forest

Random forest model

# Random forest classifier (ranger); mtry and number of trees are tuned
rf_mod <- 
  rand_forest(
    mode = "classification",
    mtry = tune(),
    trees = tune()
  ) %>% 
  set_engine("ranger")
rf_mod
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = tune()

Computational engine: ranger 

Set workflow

# Reuse the classification recipe with the random forest spec
rf_wf <- workflow() %>%
  add_recipe(class_rec) %>%
  add_model(rf_mod)
rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = tune()

Computational engine: ranger 

Tuning grid

# Regular grid: 10 values of trees in [100, 500] crossed with
# mtry in 1..5, giving 50 candidate models.
trees_param <- trees(range = c(100L, 500L))
mtry_param <- mtry(range = c(1L, 5L))
rf_grid <- grid_regular(trees_param, mtry_param, levels = c(10, 5))

10-fold Cross-validation

# Reproducible 10-fold cross-validation splits of the training data
set.seed(123)
folds <- vfold_cv(Carseats_train, v = 10)
folds
#  10-fold cross-validation 
# A tibble: 10 × 2
   splits           id    
   <list>           <chr> 
 1 <split [287/32]> Fold01
 2 <split [287/32]> Fold02
 3 <split [287/32]> Fold03
 4 <split [287/32]> Fold04
 5 <split [287/32]> Fold05
 6 <split [287/32]> Fold06
 7 <split [287/32]> Fold07
 8 <split [287/32]> Fold08
 9 <split [287/32]> Fold09
10 <split [288/31]> Fold10

Fit cross-validation

# Tune the random forest over the grid with 10-fold CV,
# tracking accuracy and ROC AUC for every candidate.
rf_fit <- tune_grid(
  rf_wf,
  resamples = folds,
  grid = rf_grid,
  metrics = metric_set(accuracy, roc_auc)
)

Visualize CV results

# Print all CV metrics, then plot CV ROC AUC against the number of
# trees, one line per mtry value.
rf_cv_metrics <- collect_metrics(rf_fit)
print(rf_cv_metrics, width = Inf)
rf_cv_metrics %>%
  filter(.metric == "roc_auc") %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(aes(x = trees, y = mean, color = mtry)) +
  geom_line() +
  labs(x = "Num. of Trees", y = "CV ROC AUC")
# A tibble: 100 × 8
    mtry trees .metric  .estimator  mean     n std_err .config              
   <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
 1     1   100 accuracy binary     0.787    10  0.0213 Preprocessor1_Model01
 2     1   100 roc_auc  binary     0.849    10  0.0242 Preprocessor1_Model01
 3     1   144 accuracy binary     0.777    10  0.0216 Preprocessor1_Model02
 4     1   144 roc_auc  binary     0.848    10  0.0216 Preprocessor1_Model02
 5     1   188 accuracy binary     0.812    10  0.0244 Preprocessor1_Model03
 6     1   188 roc_auc  binary     0.857    10  0.0209 Preprocessor1_Model03
 7     1   233 accuracy binary     0.790    10  0.0235 Preprocessor1_Model04
 8     1   233 roc_auc  binary     0.859    10  0.0241 Preprocessor1_Model04
 9     1   277 accuracy binary     0.793    10  0.0259 Preprocessor1_Model05
10     1   277 roc_auc  binary     0.861    10  0.0203 Preprocessor1_Model05
# ℹ 90 more rows

Finalize random forest model

# Top 5 candidates by CV ROC AUC. The metric is passed by name:
# supplying it positionally (show_best("roc_auc")) is deprecated in
# tune >= 1.0 and emits a warning in current releases.
rf_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
   mtry trees .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     4   233 roc_auc binary     0.892    10  0.0180 Preprocessor1_Model34
2     5   188 roc_auc binary     0.891    10  0.0180 Preprocessor1_Model43
3     4   188 roc_auc binary     0.889    10  0.0175 Preprocessor1_Model33
4     3   144 roc_auc binary     0.889    10  0.0190 Preprocessor1_Model22
5     3   500 roc_auc binary     0.888    10  0.0197 Preprocessor1_Model30
# Pick the hyperparameter combination with the highest CV ROC AUC.
# The metric is named explicitly (positional metric is deprecated in tune).
best_rf <- rf_fit %>%
  select_best(metric = "roc_auc")
best_rf
# A tibble: 1 × 3
   mtry trees .config              
  <int> <int> <chr>                
1     4   233 Preprocessor1_Model34

Final workflow

# Substitute the winning hyperparameters into the workflow
final_rf_wf <- finalize_workflow(rf_wf, best_rf)
final_rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = 4
  trees = 233

Computational engine: ranger 

Fit the whole training set, then predict the test cases

# Refit the finalized random forest on the whole training set and
# evaluate on the test split.
final_rf_fit <- last_fit(final_rf_wf, data_split)

# Test-set metrics
collect_metrics(final_rf_fit)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.802 Preprocessor1_Model1
2 roc_auc  binary         0.877 Preprocessor1_Model1

Summary: The random forest algorithm grows multiple decision trees. The final random forest model has 233 trees with 4 predictors randomly sampled at each split. In classification, the random forest estimates the Sales_bi category of a new observation by the majority vote across all trees; it performs well, with an accuracy of 0.80 and a ROC AUC of 0.88 on the test set.

Train gradient boosting

Gradient boosting model

# Gradient boosting specification: 1000 boosting iterations, with the
# tree depth and learning rate left for CV tuning; xgboost engine.
gb_mod <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
gb_mod
Boosted Tree Model Specification (classification)

Main Arguments:
  trees = 1000
  tree_depth = tune()
  learn_rate = tune()

Computational engine: xgboost 

Set workflow

# Same preprocessing recipe, boosted-tree model
gb_wf <- workflow() %>%
  add_model(gb_mod) %>%
  add_recipe(class_rec)
gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)

Main Arguments:
  trees = 1000
  tree_depth = tune()
  learn_rate = tune()

Computational engine: xgboost 

Tuning grid

# 5 tree depths crossed with 20 learning rates spaced on a log10
# scale between 1e-5 and 1 (100 candidates in total).
depth_param <- tree_depth(range = c(1L, 5L))
lr_param <- learn_rate(range = c(-5, 0), trans = log10_trans())
gb_grid <- grid_regular(depth_param, lr_param, levels = c(5, 20))
gb_grid
# A tibble: 100 × 2
   tree_depth learn_rate
        <int>      <dbl>
 1          1  0.00001  
 2          2  0.00001  
 3          3  0.00001  
 4          4  0.00001  
 5          5  0.00001  
 6          1  0.0000183
 7          2  0.0000183
 8          3  0.0000183
 9          4  0.0000183
10          5  0.0000183
# ℹ 90 more rows

10-fold Cross-validation

# Same seed as before so the CV folds match the random forest tuning
set.seed(123)
folds <- vfold_cv(Carseats_train, v = 10)

Fit cross-validation

# Tune the boosted trees over the grid with 10-fold CV
gb_fit <- tune_grid(
  gb_wf,
  resamples = folds,
  grid = gb_grid,
  metrics = metric_set(accuracy, roc_auc)
)

Visualize CV results

# Print all CV metrics, then plot CV ROC AUC against the learning
# rate, one curve per tree depth.
gb_cv_metrics <- collect_metrics(gb_fit)
print(gb_cv_metrics, width = Inf)
gb_cv_metrics %>%
  filter(.metric == "roc_auc") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(aes(x = learn_rate, y = mean, color = tree_depth)) +
  geom_point() +
  geom_line() +
  labs(x = "Learning Rate", y = "CV ROC AUC", color = "tree_depth")
# A tibble: 200 × 8
   tree_depth learn_rate .metric  .estimator  mean     n std_err
        <int>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl>
 1          1    0.00001 accuracy binary     0.724    10  0.0237
 2          1    0.00001 roc_auc  binary     0.681    10  0.0217
 3          2    0.00001 accuracy binary     0.721    10  0.0248
 4          2    0.00001 roc_auc  binary     0.740    10  0.0227
 5          3    0.00001 accuracy binary     0.702    10  0.0222
 6          3    0.00001 roc_auc  binary     0.722    10  0.0249
 7          4    0.00001 accuracy binary     0.736    10  0.0234
 8          4    0.00001 roc_auc  binary     0.767    10  0.0249
 9          5    0.00001 accuracy binary     0.746    10  0.0272
10          5    0.00001 roc_auc  binary     0.780    10  0.0200
   .config               
   <chr>                 
 1 Preprocessor1_Model001
 2 Preprocessor1_Model001
 3 Preprocessor1_Model002
 4 Preprocessor1_Model002
 5 Preprocessor1_Model003
 6 Preprocessor1_Model003
 7 Preprocessor1_Model004
 8 Preprocessor1_Model004
 9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 190 more rows

Finalize gradient boosting model

# Top 5 candidates by CV ROC AUC; metric passed by name because the
# positional form show_best("roc_auc") is deprecated in tune >= 1.0.
gb_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
  tree_depth learn_rate .metric .estimator  mean     n std_err .config          
       <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
1          1     0.0886 roc_auc binary     0.932    10  0.0163 Preprocessor1_Mo…
2          1     0.0483 roc_auc binary     0.929    10  0.0176 Preprocessor1_Mo…
3          1     0.162  roc_auc binary     0.928    10  0.0168 Preprocessor1_Mo…
4          2     0.0264 roc_auc binary     0.925    10  0.0150 Preprocessor1_Mo…
5          2     0.0483 roc_auc binary     0.925    10  0.0153 Preprocessor1_Mo…
# Best hyperparameters by CV ROC AUC; metric named explicitly
# (positional metric is deprecated in tune).
best_gb <- gb_fit %>%
  select_best(metric = "roc_auc")
best_gb
# A tibble: 1 × 3
  tree_depth learn_rate .config               
       <int>      <dbl> <chr>                 
1          1     0.0886 Preprocessor1_Model076

Final workflow

# Substitute the winning hyperparameters into the boosting workflow
final_gb_wf <- finalize_workflow(gb_wf, best_gb)

final_gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)

Main Arguments:
  trees = 1000
  tree_depth = 1
  learn_rate = 0.0885866790410082

Computational engine: xgboost 

Fit the whole training set, then predict the test cases

# Refit the finalized boosted model on the whole training set and
# evaluate on the test split.
final_gb_fit <- last_fit(final_gb_wf, data_split)

# Test metrics
collect_metrics(final_gb_fit)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.877 Preprocessor1_Model1
2 roc_auc  binary         0.946 Preprocessor1_Model1

Summary: Gradient boosting builds trees in a sequential manner, where each new tree aims to correct the errors made by the previous ones. The final gradient boosting model has 1000 trees, a tree depth of 1, and a learning rate of 0.0886. The model estimates the Sales_bi category of a new observation by adding trees that predict the residuals (errors) of the prior trees, and it has the best performance, with an accuracy of 0.88 and a ROC AUC of 0.95 on the test set.

Compare Performance

# Side-by-side test-set performance of the three models.
# pivot_wider() is used instead of the superseded spread().
bind_rows(
  final_classtree_fit %>% collect_metrics() %>%
    mutate(model = "Classification Tree"),
  final_rf_fit %>% collect_metrics() %>%
    mutate(model = "Random Forest"),
  final_gb_fit %>% collect_metrics() %>%
    mutate(model = "Gradient Boosting")
) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate)
# A tibble: 3 × 3
  model               accuracy roc_auc
  <chr>                  <dbl>   <dbl>
1 Classification Tree    0.728   0.639
2 Gradient Boosting      0.877   0.946
3 Random Forest          0.802   0.877